#!/usr/bin/env python

from __future__ import print_function

import sys

'''
Parses records that look like:

  276  28.4  0.0  1.2  chr1       837099  837180 (248119242) +  SVA_D          Retroposon/SVA         131  211 (1175)   1082
 9901   1.8  0.4 25.1  chr1      6304394 6306236 (242650186) C  SVA_D          Retroposon/SVA         (0) 1386      1   7974
 6435   8.0  2.4  3.7  chr1      6714508 6716330 (242240092) C  SVA_D          Retroposon/SVA         (1) 1385      1   8739

Expects records in sorted order.
'''

chromosomes = ['chr'+c for c in map(str, list(range(1, 23)) + ['X', 'Y'])]

class Rmsk:
    def __init__(self, raw):
        raw = raw.strip().split()
        self.chrom = raw[4]
        self.start = int(raw[5])
        self.end   = int(raw[6])
        self.orient = raw[8]
        self.score = [1000.0-float(raw[1])*10]

        if self.orient == 'C':
            self.orient = '-'

        self.fam = raw[9]

    def near(self, other):
        assert self < other, 'record not in sorted order: %s' % str(other)
        return other.start - self.end < 500 and self.chrom == other.chrom and self.orient == other.orient

    def mean_score(self):
        return int(sum(self.score)/len(self.score))

    def __lt__(self, other):
        if self.chrom == other.chrom:
            return self.start < other.start

        return self.chrom < other.chrom

    def __str__(self):
        return '%s\t%d\t%d\t%s\t%d\t%s' % (self.chrom, self.start, self.end, self.fam, self.mean_score(), self.orient)


if len(sys.argv) == 2:
    with open(sys.argv[1], 'r') as raw:
        sva = None 
        
        for line in raw:
            if 'SVA' not in line:
                continue

            # ignore unplaced & alternate sequence
            if line.strip().split()[4] not in chromosomes:
                continue

            if sva is None:
                sva = Rmsk(line)

            else:
                rec = Rmsk(line)
                if sva.near(rec):
                    sva.end = rec.end
                    sva.score += rec.score

                else:
                    print(sva)
                    sva = rec

        print(sva)

else:
    sys.exit('usage: %s <repeatmasker .out file from UCSC>' % sys.argv[0])
